# Split the original dataset into a training set and a validation set.
import pandas as pd
from sklearn.model_selection import train_test_split

# Destination files for the two partitions. The held-out partition is stored
# as the validation file even though its variable is named "test".
output_train_file = 'train.csv'
output_test_file = 'valid.csv'

# Load the complete tab-separated dataset.
df = pd.read_csv('train_set.csv', sep='\t')

# Hold out 10% of the rows, stratifying on the 'label' column; the fixed
# random_state keeps the split identical from run to run.
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df['label'],
    random_state=2025,
)

# Write both partitions as tab-separated files without the index column.
# Both files are written before anything is reported.
for frame, path in (
    (train_df, output_train_file),
    (test_df, output_test_file),
):
    frame.to_csv(path, sep='\t', index=False)

# Report where each partition was saved.
print(f"Processed training data saved to {output_train_file}")
print(f"Processed testing data saved to {output_test_file}")
